Random Forest summary

Alex Koiter

Load libraries

In [1]:

suppressPackageStartupMessages({
  library(tidyverse)
  library(patchwork)
  library(gt)
})

Warning: package 'ggplot2' was built under R version 4.4.3Warning: package 'patchwork' was built under R version 4.4.2

Importance

In [2]:

imp_data <- read_csv(here::here("./notebooks/importance_data.csv")) %>%
  rename("Terrain Attribute" = "Var.Names") %>%
  mutate(`Terrain Attribute` = fct_recode(`Terrain Attribute`, "Elevation" = "elevation", 
                                          "Catchment Area" = "catchment_area",
                                          "Plan Curvature" = "plan_curvature",
                                          "Profile Curvature" = "profile_curvature",
                                          "Rel. Slope Position" = "relative_slope_position",
                                          "SAGA Wetness Index" = "saga_wetness_index",
                                          "Vert. Dist. Channel" = "channel_network_distance")) %>%
  mutate(`Terrain Attribute` = fct_relevel(`Terrain Attribute`, "Elevation", "SAGA Wetness Index", "Rel. Slope Position", "Vert. Dist. Channel","Catchment Area", "Profile Curvature", "Plan Curvature")) %>%
  mutate(property = fct_recode(property, "italic(`a*`)" = "a_col", "italic(`b*`)" = "b_col", "italic(`c*`)" = "c_col", "italic(`h*`)" = "h_col", "italic(`x`)" = "x_col")) %>%
  mutate(property = fct_relevel(property, "Ca", "Co", "Cs", "Fe", "Li", "La", "Nb", "Ni", "Rb", "Sr", "italic(`a*`)", "italic(`b*`)", "italic(`c*`)", "italic(`h*`)", "italic(`x`)"))

Rows: 168 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Var.Names, site, property
dbl (4): %IncMSE, IncNodePurity, MSE_rank, Purity_rank

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.`summarise()` has grouped output by 'Terrain Attribute'. You can override using
the `.groups` argument.

In [3]:

p <- g2 + g1 + plot_layout(ncol = 1, heights = c(2, 10), guides = "collect") & theme(legend.position = 'right')
p
# ggsave(filename = "../Submission/Figure 4.png", plot = p, height = 130, width = 140, units = "mm", dpi = 600)

Figure 1: Heat map of the Random Forest regression results showing the ranking of the importance of terrain attributes (based on % increase in Mean Squared Error) in explaining the spatial variabilty of selected colour and geochemical properties within the agricultural and forested sites. Top panel shows an average ranking for each site and across both sites.

Performance

In [4]:

m2_performance <- read_csv(here::here("./notebooks/model_performance_data_49.csv")) |>
  mutate(property = fct_recode(property, "a*" = "a_col", "b*" = "b_col", "c*" = "c_col", "h*" = "h_col", "x" = "x_col", "Ca" = "ca", "Co" = "co", "Cs" = "cs", "Fe" = "fe", "Li" = "li", "La" = "la", "Nb" = "nb", "Ni" = "ni", "Rb" = "rb", "Sr" = "sr")) |>
  mutate(property = fct_relevel(property, "Ca", "Co", "Cs", "Fe", "Li", "La", "Nb", "Ni", "Rb", "Sr", "a*", "b*", "c*", "h*", "x")) |>
  rename("MSE_org" ="MSE", "R2_org" = "R2") |>
  arrange(property)

Rows: 24 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): site, property
dbl (2): MSE, R2

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

In [5]:

m_performance <- read_csv(here::here("./notebooks/model_performance_data.csv")) |>
  mutate(property = fct_recode(property, "a*" = "a_col", "b*" = "b_col", "c*" = "c_col", "h*" = "h_col", "x" = "x_col")) |>
  mutate(property = fct_relevel(property, "Ca", "Co", "Cs", "Fe", "Li", "La", "Nb", "Ni", "Rb", "Sr", "a*", "b*", "c*", "h*", "x")) |>
  mutate(Var_exp = Var_exp *100,
         Var_exp_test = Var_exp_test *100) |>
  arrange(property) |>
  left_join(m2_performance, by = c("site", "property"))

Rows: 22 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): site, property
dbl (6): MSE, Var_exp, MSE_test, Var_exp_test, R2, mse_test

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

In [6]:

In [7]:

m_performance |>
  group_by(site) |>
  gt() |>
  tab_spanner(label = "Training",
               columns = c(Var_exp, MSE)) |>
  tab_spanner(label = "Validation",
              columns = c(Var_exp_test, MSE_test)) |>
  tab_spanner(label = "Test",
              columns = c(R2, mse_test)) |>
    tab_spanner(label = "Original",
              columns = c(R2_org, MSE_org)) |>
  cols_move_to_start(columns = property) |>
  # tab_spanner(label = "Training", columns = c("MSE", "Var_exp")) |>
  # tab_spanner(label = "Validation", columns = c("MSE_test", "Var_exp_test")) |>
  # tab_spanner(label = "Testing", columns = "R2") |>
  fmt_number(columns = c("MSE_test", "MSE", "mse_test", "MSE_org"), decimal = 2) |>
  fmt_number(columns = c("Var_exp_test", "Var_exp"), decimal = 1) |>
  fmt_number(columns = c("R2", "R2_org"), decimal = 2) |>
  tab_style(style =  cell_text(weight = "bold", align = "center"), locations =  cells_row_groups()) |>
  tab_style(style =  cell_text(align = "left"), locations =  cells_body()) |>
  tab_style(style =  cell_text(weight = "bold", align = "left"), locations =  cells_column_labels()) |>
  # cols_label(Var_exp = md("% Var<br/>Training"), Var_exp_test = md("% Var<br/>Validation"), MSE_test = md("MSE<br/>Testing"), MSE = md("MSE<br/>Training"), R2 = md("{{R^2}}<br/>Testing"), property = "Property") |>
    cols_label(Var_exp = "% Variance", Var_exp_test = "% Variance", MSE_test = "MSE", MSE = "MSE", R2 = "{{R^2}}", R2_org = "{{R^2}}", property = "Property", mse_test = "MSE", MSE_org = "MSE") |>
  row_group_order(groups = c("Agriculture", "Forest"))  |>
  tab_style(
    style = list(cell_text(style = "italic")),
    locations = cells_body(
      columns = property,
      rows = property %in% c("a*", "b*", "c*", "h*", "x"))) |>
    tab_footnote(
    footnote = "Mean square error",
    locations = cells_column_labels(columns = c("mse_test", "MSE_test", "MSE", "MSE_org"))) |>
  tab_footnote(
    footnote = "Percent variance explained",
    locations = cells_column_labels(columns = c("Var_exp", "Var_exp_test")))

Table 1: Model summary and performance statistics for the random forest regression using the training, validation, test and original (non-interpolated) data sets.

Property	Training		Validation		Test		Original
Property	% Variance¹	MSE²	% Variance¹	MSE²	R²	MSE²	R²	MSE²
Agriculture
Ca	91.6	0.37	91.8	0.36	0.91	0.38	0.95	0.23
Co	79.8	0.09	82.5	0.08	0.80	0.08	0.88	0.08
Cs	85.7	0.00	86.4	0.00	0.85	0.00	0.92	0.00
Fe	69.6	0.00	70.9	0.00	0.69	0.00	0.83	0.00
Li	59.3	0.54	59.8	0.53	0.64	0.51	0.88	0.24
La	93.0	0.05	93.1	0.04	0.93	0.05	0.96	0.03
Nb	57.3	0.00	59.1	0.00	0.55	0.00	0.71	0.00
Ni	93.1	0.34	93.7	0.33	0.93	0.34	0.95	0.25
Rb	95.3	0.73	96.1	0.64	0.95	0.79	0.98	0.39
Sr	93.5	97.22	93.6	93.97	0.93	105.59	0.97	44.77
a*	85.0	0.01	86.9	0.01	0.85	0.01	0.91	0.00
b*	72.5	0.14	75.3	0.12	0.72	0.15	0.89	0.09
c*	73.2	0.15	75.9	0.14	0.73	0.17	0.89	0.10
h*	58.3	0.00	58.6	0.00	0.56	0.00	0.73	0.00
x	73.3	0.00	73.6	0.00	0.69	0.00	0.82	0.00
Forest
Co	39.1	0.24	42.9	0.23	0.48	0.21	0.77	0.29
Cs	64.1	0.00	67.1	0.00	0.66	0.00	0.86	0.00
Li	41.3	0.28	42.0	0.28	0.46	0.28	0.66	0.26
La	43.3	1.40	47.5	1.32	0.48	1.23	0.78	0.60
Nb	55.0	0.00	55.9	0.00	0.58	0.00	0.84	0.00
Sr	59.4	29.43	59.1	29.66	0.59	29.25	0.82	13.88
h*	58.8	0.00	60.3	0.00	0.62	0.00	0.86	0.00
¹ Percent variance explained
² Mean square error